library(RColorBrewer)
library(scales)
library(ISLR)
library(MASS)
library(mvtnorm)
cols <- brewer.pal(9, "Set1")
Let us generate some data: simulating from a known model is a lazy but effective way of verifying that the implementation of a model is correct.
set.seed(1234)
n <- 500
X <- rnorm(n, 3, 5)
eta <- 2 - 0.3 * X
We will also need a function that computes the logistic function (the inverse of the logit), \[F(\eta) = \dfrac{1}{1 + e^{-\eta}}\]
logit <- function(x){ # despite the name, this is the inverse logit (logistic function)
return (1/(1 + exp(-x)))
}
We can then generate the data with the implied probabilities
Y <- array(NA, dim = n)
for (i in 1:n){
p_i <- logit(eta[i])
Y[i] <- sample(0:1, 1, TRUE, c(1 - p_i, p_i))
}
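As an aside, the same simulation can be written without a loop: rbinom draws one Bernoulli outcome per observation with the implied probabilities. A minimal, distributionally equivalent sketch (the random draws will of course differ from the loop above):
Y_alt <- rbinom(n, size = 1, prob = logit(eta)) # vectorized Bernoulli draws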
par(mar=c(4,4,2,2), family = 'serif')
plot(X, jitter(rep(0, n)), pch = 16, col = alpha(cols[Y + 1], 0.6),
xlab = 'X', ylab = '', ylim = c(0, 1))
legend('topright', legend = c('0','1'), lwd = 2, col = cols[1:2])
We fit the logistic regression
fit_glm <- glm(Y ~ X, family = 'binomial')
round(coef(fit_glm), 3)
(Intercept) X
2.036 -0.286
The coefficients are very close to the ground truth. Things seem to be working! Let us make predictions.
X_gr <- data.frame(X = seq(min(X), max(X), length.out = 100))
p_gr <- as.numeric(predict(fit_glm, newdata = X_gr, type = "response"))
par(mar=c(4,4,2,2), family = 'serif')
plot(X, jitter(rep(0, n)), pch = 16, col = alpha(cols[Y + 1], 0.6),
xlab = 'X', ylab = '', ylim = c(0, 1))
lines(X_gr$X, p_gr, lwd = 2)
legend('topright', legend = c('0','1','P(Y = 1)'), lwd = 2, col = c(cols[1:2], 'black'))
We can also add the horizontal line corresponding to probabilities equal to \(50\%\). This allows us to show the decision boundary, i.e. the point at which the prediction switches from one class to another.
boundary <- X_gr$X[which.min(abs(p_gr - 0.5))]
par(mar=c(4,4,2,2), family = 'serif')
plot(X, jitter(rep(0, n)), pch = 16, col = alpha(cols[Y + 1], 0.6), xlab = 'X', ylab = '', ylim = c(0, 1))
lines(X_gr$X, p_gr, lwd = 2)
abline(h = 0.5, lwd = 2, lty = 3)
abline(v = boundary, lwd = 2, lty = 3)
legend('topright', legend = c('0','1','P(Y = 1)'), lwd = 2, col = c(cols[1:2], 'black'))
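As a sanity check, with a single predictor the boundary is also available in closed form: the fitted probability equals \(0.5\) exactly where the linear predictor \(\hat\beta_0 + \hat\beta_1 x\) equals zero. A minimal sketch:
as.numeric(-coef(fit_glm)[1] / coef(fit_glm)[2]) # closed-form boundary; should be close to `boundary` above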
Let us load and visualize the Default data.
rm(list=ls())
cols <- brewer.pal(9, "Set1")
data("Default")
head(Default)
default student balance income
1 No No 729.5265 44361.625
2 No Yes 817.1804 12106.135
3 No No 1073.5492 31767.139
4 No No 529.2506 35704.494
5 No No 785.6559 38463.496
6 No Yes 919.5885 7491.559
par(mar=c(4,4,2,2), family = 'serif')
plot(Default$balance, Default$income, pch = 16,
col = alpha(cols[Default$default], 0.6),
xlab = 'balance', ylab = 'income')
The class of people who do not default is much more prevalent (about \(97\%\) of the observations). It is hard to spot any pattern, since the red points cover the blue points. We can subsample some of the red points and show a balanced dataset where the proportion of defaults vs. non-defaults is \(50\%-50\%\).
set.seed(123)
idx_resampl <- c(which(Default$default == 'Yes'), sample(which(Default$default == 'No'), sum(Default$default == 'Yes'), replace = FALSE)) # concatenate all defaults with subsample of non defaults
Default_pl <- Default[idx_resampl,]
par(mar=c(4,4,2,2), family = 'serif')
plot(Default_pl$balance, Default_pl$income, pch = 16, col = alpha(cols[Default_pl$default], 0.6),
xlab = 'balance', ylab = 'income')
We can show the marginal distributions for the predictors used in the model, stratified by outcome variable.
par(mar=c(4,4,2,2), mfrow = c(1,2), family = 'serif')
boxplot(balance ~ default, data = Default, col = cols[1:2], pch = 16)
boxplot(income ~ default, data = Default, col = cols[1:2], pch = 16)
A first approach could be to use a simple linear model.
\[\mathbb{P}(y_i = 1 \mid \textbf{x}_i)=\textbf{x}_i^{\intercal} \mathbf{\beta}\]
This is called the linear probability model: the probability of a “yes” outcome (\(y = 1\)) is linear in \(\textbf{x}\).
Default$numeric_def <- as.numeric(Default$default) - 1
fit_lm <- lm(numeric_def ~ balance + income, data = Default)
summary(fit_lm)
Call:
lm(formula = numeric_def ~ balance + income, data = Default)
Residuals:
Min 1Q Median 3Q Max
-0.24607 -0.06989 -0.02663 0.02005 0.98554
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -9.224e-02 5.788e-03 -15.936 < 2e-16 ***
balance 1.318e-04 3.514e-06 37.511 < 2e-16 ***
income 4.605e-07 1.274e-07 3.613 0.000304 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.168 on 9997 degrees of freedom
Multiple R-squared: 0.1237, Adjusted R-squared: 0.1236
F-statistic: 705.8 on 2 and 9997 DF, p-value: < 2.2e-16
After we obtain these “probabilities”, we can predict the response to be 1 if, for example, the fitted probability is larger than \(0.5\).
Y_hat <- as.numeric(predict(fit_lm, newdata = Default))
Y_hat <- ifelse(Y_hat > 0.5, 1, 0)
conf_mat <- table(pred = Y_hat, true = Default$numeric_def)
sum(diag(conf_mat))/sum(conf_mat) # in-sample accuracy
[1] 0.9667
The in-sample performance seems excellent! But how well are we really doing? Note that \(96.67\%\) of the training set is not default, so “not default” is the most likely outcome. A reasonable baseline, or “naive model”, is therefore one that guesses “not default” for every training set instance.
How well does this null model (or naive classifier) perform on the training set? Its accuracy is about \(96.67\%\), since it gets all the 0’s right and all the 1’s wrong.
table(Default$numeric_def)
0 1
9667 333
max(table(Default$numeric_def))/sum(table(Default$numeric_def)) # in sample accuracy of naive classifier
[1] 0.9667
The mystery is solved: our linear probability model always predicts \(0\), so it is no smarter than guessing “not default” for every instance.
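One quick way to verify this claim is to tabulate the fitted labels, which should show (almost) no predicted 1’s:
table(Y_hat) # how often does the linear probability model predict 1?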
The linear probability model has one obvious problem: it can produce fitted probabilities that fall outside \((0,1)\). This is just wrong: you should never use a linear model to predict probabilities! Let us learn something more appropriate: logistic regression.
Now we have \[\mathbb{P}(y_i = 1 \mid \textbf{x}_i)=F(\textbf{x}_i^\intercal \mathbf{\beta}) = \dfrac{e^{\textbf{x}_i^\intercal \mathbf{\beta}}}{1 + e^{\textbf{x}_i^\intercal \mathbf{\beta}}} = \dfrac{1}{1 + e^{-\textbf{x}_i^\intercal \mathbf{\beta}}}.\]
fit_glm <- glm(numeric_def ~ balance + income, data = Default, family = 'binomial')
round(coef(fit_glm), 3)
(Intercept) balance income
-11.540 0.006 0.000
Recall our model is
\[\text{odds}=\dfrac{p}{1−p} = e^{\beta_0} e^{\beta_1 x_1} \dots e^{\beta_p x_p}\]
So \(e^{\beta_j}\) is an odds multiplier, or odds ratio, for a one-unit increase in the feature \(x_j\), keeping all the others fixed.
The \(\beta\) for balance is \(0.006\). So carrying an extra \(\$100\) of balance multiplies the odds of default by \(e^{0.6} \approx 1.82\), all other factors being equal.
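We can check this number directly from the unrounded fitted coefficient (it will be slightly below \(1.82\), since \(0.006\) is a rounded value):
exp(100 * coef(fit_glm)['balance']) # odds multiplier for a $100 increase in balance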
Let us visualize the decision boundary.
balance_gr <- seq(min(Default$balance), max(Default$balance), length.out = 100)
income_gr <- seq(min(Default$income), max(Default$income), length.out = 100)
X_gr <- expand.grid(balance = balance_gr, income = income_gr)
p_gr <- as.numeric(predict(fit_glm, newdata = X_gr, type = "response"))
par(mar=c(4,4,2,2), family = 'serif')
plot(Default_pl$balance, Default_pl$income, pch = 16, col = alpha(cols[Default_pl$default], 0.6), xlab = 'balance', ylab = 'income')
contour(x = balance_gr, y = income_gr,
z = matrix(p_gr, length(balance_gr), length(income_gr), byrow = F),
levels = 0.5, lwd = 2, add = TRUE)
Remember: the full dataset contains many more observations than the balanced subsample shown above!
par(mar=c(4,4,2,2), family = 'serif')
plot(Default$balance, Default$income, pch = 16, col = alpha(cols[Default$default], 0.6), xlab = 'balance', ylab = 'income')
contour(x = balance_gr, y = income_gr,
z = matrix(p_gr, length(balance_gr), length(income_gr), byrow = F),
levels = 0.5, lwd = 2, add = TRUE)
We can also calculate the accuracy.
p_hat <- as.numeric(predict(fit_glm, newdata = Default, type = "response"))
Y_hat <- ifelse(p_hat > 0.5, 1, 0)
conf_mat <- table(pred = Y_hat, true = Default$numeric_def)
conf_mat
true
pred 0 1
0 9629 225
1 38 108
sum(diag(conf_mat))/sum(conf_mat) # in-sample accuracy
[1] 0.9737
That is a somewhat sad conclusion: we are not improving by much. But at least now we are correctly identifying some of the positive outcomes! Also remember that we can change the threshold value.
We show here how the true positives and true negatives evolve for different values of the threshold \(s\). The code is not displayed here; I leave it as an exercise (it will be requested in the next homework). Hint: you just need a for loop that, at every iteration, changes the value of the threshold \(s\) and computes the FPR, FNR, etc.
The ROC (receiver operating characteristic) curve looks at the true positives and false positives for varying levels of \(s\).
We can use a package to compute the ROC curve. A simple Google search will turn up dozens of packages that do it. For example, let us try ROSE.
library(ROSE)
roc.curve(Default$default, p_hat)
Area under the curve (AUC): 0.949
Let us load data containing the characteristics of three species of flowers.
rm(list=ls())
cols <- brewer.pal(7, "Set2")
data(iris)
iris <- iris[,c(1:2, 5)] # keep Sepal.Length, Sepal.Width, and Species
n <- nrow(iris)
par(mar=c(4,4,2,2), family = 'serif')
plot(jitter(iris$Sepal.Length), jitter(iris$Sepal.Width),
col = cols[iris$Species], pch = 16,
xlab = 'Length', ylab = 'Width')
Bayes’ rule: \[\mathbb{P}(Y = k \mid \mathbf{x}) = \frac{\mathbb{P}(\mathbf{x} \mid Y = k) \mathbb{P}(Y = k)}{\mathbb{P}(\mathbf{x})}\]
\(\mathbb{P}(\mathbf{x})\) is the marginal probability of observing feature vector \(\mathbf{x}\). Notice that it does not depend on \(k\) (it is the same number for all classes). Thus, we usually write the posterior probabilities up to this constant of proportionality, without bothering to compute it: \[\mathbb{P}(Y = k \mid \mathbf{x}) \propto \mathbb{P}(\mathbf{x} \mid Y = k) \mathbb{P}(Y = k)\]
Note: often we do the actual computations on a log scale instead (for numerical stability).
The hard part is estimating the likelihood \(\mathbb{P}(\mathbf{x} \mid Y = k)\). In words: how likely is it that we would have observed feature vector \(\mathbf{x}\) if the class label were \(k\)? \[ \begin{align} \mathbb{P}(Y = k \mid \mathbf{x}) \propto \pi_{k} N(\mathbf{x} \mid \boldsymbol{\mu}_{k}, \Sigma_{k}) \end{align} \]
We could estimate the LDA/QDA parameters by hand, without having to rely on implementations existing in R.
idx1 <- which(iris$Species == 'setosa') # indexes first class
idx2 <- which(iris$Species == 'versicolor') # indexes second class
idx3 <- which(iris$Species == 'virginica') # indexes third class
n1 <- sum(iris$Species == 'setosa') # sample size first class
n2 <- sum(iris$Species == 'versicolor') # sample size second class
n3 <- sum(iris$Species == 'virginica') # sample size third class
mu1 <- as.numeric(colMeans(iris[idx1,1:2])) # mean first class
mu2 <- as.numeric(colMeans(iris[idx2,1:2])) # mean second class
mu3 <- as.numeric(colMeans(iris[idx3,1:2])) # mean third class
Sigma1 <- as.matrix(cov(iris[idx1,1:2])) # covariance matrix first class
Sigma2 <- as.matrix(cov(iris[idx2,1:2])) # covariance matrix second class
Sigma3 <- as.matrix(cov(iris[idx3,1:2])) # covariance matrix third class
# Pooled estimate for the covariance (slide seen in class)
Sigma <- 1/(n - 3) * ((n1 - 1) * Sigma1 + (n2 - 1) * Sigma2 + (n3 - 1) * Sigma3)
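With these pieces we could classify observations by hand using the linear discriminant scores \(\delta_k(\mathbf{x}) = \mathbf{x}^\intercal \Sigma^{-1}\boldsymbol{\mu}_k - \frac{1}{2}\boldsymbol{\mu}_k^\intercal \Sigma^{-1}\boldsymbol{\mu}_k + \log \pi_k\). A minimal sketch (the helper delta and the test point x0 are only for illustration):
Sigma_inv <- solve(Sigma)
delta <- function(x, mu, pi_k){ # linear discriminant score for one class
return (as.numeric(t(x) %*% Sigma_inv %*% mu - 0.5 * t(mu) %*% Sigma_inv %*% mu + log(pi_k)))
}
x0 <- as.numeric(iris[1, 1:2]) # first flower, used as an example
scores <- c(setosa = delta(x0, mu1, n1/n),
versicolor = delta(x0, mu2, n2/n),
virginica = delta(x0, mu3, n3/n))
names(which.max(scores)) # predicted class for x0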
Or we can simply use the R implementation!
lda_fit <- lda(iris[,1:2], iris$Species)
lda_fit
Call:
lda(iris[, 1:2], iris$Species)
Prior probabilities of groups:
setosa versicolor virginica
0.3333333 0.3333333 0.3333333
Group means:
Sepal.Length Sepal.Width
setosa 5.006 3.428
versicolor 5.936 2.770
virginica 6.588 2.974
Coefficients of linear discriminants:
LD1 LD2
Sepal.Length -2.141178 -0.8152721
Sepal.Width 2.768109 -2.0960764
Proportion of trace:
LD1 LD2
0.9628 0.0372
lda_pred <- predict(lda_fit, iris[,1:2])
We can show the confusion matrix.
# Confusion matrix:
table(true = iris$Species, pred = lda_pred$class)
pred
true setosa versicolor virginica
setosa 49 1 0
versicolor 0 36 14
virginica 0 15 35
And the decision boundaries.
x <- seq(min(iris[,1]), max(iris[,1]), length.out = 100)
y <- seq(min(iris[,2]), max(iris[,2]), length.out = 100)
xy <- expand.grid(Sepal.Length = x, Sepal.Width = y)
z <- predict(lda_fit, xy)$post
z1 <- z[,1] - pmax(z[,2], z[,3]) # boundary of class 1
z2 <- z[,2] - pmax(z[,1], z[,3]) # boundary of class 2
z3 <- z[,3] - pmax(z[,1], z[,2]) # boundary of class 3
f1 <- matrix(dmvnorm(xy, mu1, Sigma), 100, 100) # class-conditional densities with the pooled covariance (LDA)
f2 <- matrix(dmvnorm(xy, mu2, Sigma), 100, 100)
f3 <- matrix(dmvnorm(xy, mu3, Sigma), 100, 100)
par(mar=c(4,4,2,2), family = 'serif')
plot(jitter(iris$Sepal.Length), jitter(iris$Sepal.Width),
col = cols[iris$Species], pch = 16,
xlab = 'Length', ylab = 'Width')
contour(x, y, matrix(z1, 100, 100), levels = 0, lwd = 2, lty = 2,
drawlabels = F, add = T)
contour(x, y, matrix(z2, 100, 100), levels = 0, lwd = 2, lty = 2,
drawlabels = F, add = T)
# contour(x, y, matrix(z3, 100, 100), levels = 0, lwd = 2, lty = 2,
# drawlabels = F, add = T)
contour(x, y, f1, col = cols[1], lwd = 2, levels = 0.3, drawlabels = F, add = T)
contour(x, y, f2, col = cols[2], lwd = 2, levels = 0.3, drawlabels = F, add = T)
contour(x, y, f3, col = cols[3], lwd = 2, levels = 0.3, drawlabels = F, add = T)
How does QDA compare with LDA? Remember the fundamental difference: here we allow the covariance matrices to be different across the three classes. We are estimating more parameters (the model is more flexible).
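To make “more parameters” concrete: with \(p\) features and \(K\) classes, LDA estimates a single shared covariance matrix (\(p(p+1)/2\) free entries), while QDA estimates one per class (\(K \cdot p(p+1)/2\)). A quick count for our setting (\(p = 2\), \(K = 3\)):
p <- 2; K <- 3
c(LDA_cov_params = p * (p + 1) / 2, QDA_cov_params = K * p * (p + 1) / 2)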
qda_fit <- qda(iris[,1:2], iris$Species)
qda_fit
Call:
qda(iris[, 1:2], iris$Species)
Prior probabilities of groups:
setosa versicolor virginica
0.3333333 0.3333333 0.3333333
Group means:
Sepal.Length Sepal.Width
setosa 5.006 3.428
versicolor 5.936 2.770
virginica 6.588 2.974
qda_pred <- predict(qda_fit, iris[,1:2])
# Confusion matrix:
table(true = iris$Species, pred = qda_pred$class)
pred
true setosa versicolor virginica
setosa 49 1 0
versicolor 0 37 13
virginica 0 16 34
z <- predict(qda_fit, xy)$post
z1 <- z[,1] - pmax(z[,2], z[,3]) # boundary of class 1
z2 <- z[,2] - pmax(z[,1], z[,3]) # boundary of class 2
z3 <- z[,3] - pmax(z[,1], z[,2]) # boundary of class 3
f1 <- matrix(dmvnorm(xy, mu1, Sigma1), 100, 100) # class-conditional densities with class-specific covariances (QDA)
f2 <- matrix(dmvnorm(xy, mu2, Sigma2), 100, 100)
f3 <- matrix(dmvnorm(xy, mu3, Sigma3), 100, 100)
par(mar=c(4,4,2,2), family = 'serif')
plot(jitter(iris$Sepal.Length), jitter(iris$Sepal.Width),
col = cols[iris$Species], pch = 16,
xlab = 'Length', ylab = 'Width')
contour(x, y, matrix(z1, 100, 100), levels = 0, lwd = 2, lty = 2,
drawlabels = F, add = T)
contour(x, y, matrix(z2, 100, 100), levels = 0, lwd = 2, lty = 2,
drawlabels = F, add = T)
# contour(x, y, matrix(z3, 100, 100), levels = 0, lwd = 2, lty = 2,
# drawlabels = F, add = T)
contour(x, y, f1, col = cols[1], lwd = 2, levels = 0.3, drawlabels = F, add = T)
contour(x, y, f2, col = cols[2], lwd = 2, levels = 0.3, drawlabels = F, add = T)
contour(x, y, f3, col = cols[3], lwd = 2, levels = 0.3, drawlabels = F, add = T)
See how the decision boundaries changed?
Naive Bayes is very useful for high dimensional applications. Why? If we have \(p\) features, LDA and QDA would need to estimate a huge \(p \times p\) covariance matrix for the normal distribution on \(X \mid Y = k\).
Recall that \(\mathbf{x} = (x_1,x_2, \dots, x_p)\) is a vector of \(p\) features. Here our strategy for estimating \(\mathbb{P}(\mathbf{x} \mid Y = k)\) is “Naive Bayes”. It is “naive” because we make the simplifying assumption that every feature \(x_j\) is independent of all other features: \[\mathbb{P}(\mathbf{x} \mid Y = k) = \prod_{j=1}^p \mathbb{P}(x_j \mid Y = k)\]
This simplifies the requirements of the problem: just estimate the marginal distribution of the features, i.e. \[\mathbb{P}(x_j \mid Y = k)\] for all features \(j\) and classes \(k\).
In this case we can choose \[\mathbb{P}(x_j \mid Y = k) = N(x_j \mid \mu_{kj}, \sigma_{kj}^{2})\]
We can first estimate the parameters under the three classes.
idx1 <- which(iris$Species == 'setosa') # indexes first class
idx2 <- which(iris$Species == 'versicolor') # indexes second class
idx3 <- which(iris$Species == 'virginica') # indexes third class
pi1 <- sum(iris$Species == 'setosa')/n # proportion first class
pi2 <- sum(iris$Species == 'versicolor')/n # proportion second class
pi3 <- sum(iris$Species == 'virginica')/n # proportion third class
mu1 <- as.numeric(colMeans(iris[idx1,1:2])) # mean first class
mu2 <- as.numeric(colMeans(iris[idx2,1:2])) # mean second class
mu3 <- as.numeric(colMeans(iris[idx3,1:2])) # mean third class
sigma1 <- diag(apply(iris[idx1,1:2], 2, var)) # diagonal covariance matrix first class
sigma2 <- diag(apply(iris[idx2,1:2], 2, var)) # diagonal covariance matrix second class
sigma3 <- diag(apply(iris[idx3,1:2], 2, var)) # diagonal covariance matrix third class
We can then use the dmvnorm function to evaluate the log posteriors. Remember that \[\log (\text{posterior}) \propto \log (\text{likelihood}) + \log (\text{prior})\]
post1_prop <- as.numeric(dmvnorm(iris[,1:2], mean = mu1, sigma = sigma1, log = T)) + log(pi1)
post2_prop <- as.numeric(dmvnorm(iris[,1:2], mean = mu2, sigma = sigma2, log = T)) + log(pi2)
post3_prop <- as.numeric(dmvnorm(iris[,1:2], mean = mu3, sigma = sigma3, log = T)) + log(pi3)
Since those are unnormalized, we need an additional step to get actual posterior probabilities.
post1_NB <- exp(post1_prop) / (exp(post1_prop) + exp(post2_prop) + exp(post3_prop))
post2_NB <- exp(post2_prop) / (exp(post1_prop) + exp(post2_prop) + exp(post3_prop))
post3_NB <- exp(post3_prop) / (exp(post1_prop) + exp(post2_prop) + exp(post3_prop))
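Because the unnormalized log posteriors can be large in magnitude, exponentiating them directly can underflow or overflow. A numerically safer alternative (a minimal sketch of the log-sum-exp trick) subtracts the row-wise maximum before exponentiating:
log_post <- cbind(post1_prop, post2_prop, post3_prop)
log_post <- log_post - apply(log_post, 1, max) # stabilize each row before exponentiating
post_NB <- exp(log_post) / rowSums(exp(log_post)) # normalized posterior probabilities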
We can use these probabilities to make predictions (more on this in the next homework).
A simple example of k-nearest neighbours on the Caravan data.
library(class) # to load knn
data(Caravan)
dim(Caravan)
[1] 5822 86
head(Caravan)
MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE MRELGE
1 33 1 3 2 8 0 5 1 3 7
2 37 1 2 2 8 1 4 1 4 6
3 37 1 2 2 8 0 4 2 4 3
4 9 1 3 3 3 2 3 2 4 5
5 40 1 4 2 10 1 4 1 4 7
6 23 1 2 1 5 0 5 0 5 0
MRELSA MRELOV MFALLEEN MFGEKIND MFWEKIND MOPLHOOG MOPLMIDD MOPLLAAG MBERHOOG
1 0 2 1 2 6 1 2 7 1
2 2 2 0 4 5 0 5 4 0
3 2 4 4 4 2 0 5 4 0
4 2 2 2 3 4 3 4 2 4
5 1 2 2 4 4 5 4 0 0
6 6 3 3 5 2 0 5 4 2
MBERZELF MBERBOER MBERMIDD MBERARBG MBERARBO MSKA MSKB1 MSKB2 MSKC MSKD
1 0 1 2 5 2 1 1 2 6 1
2 0 0 5 0 4 0 2 3 5 0
3 0 0 7 0 2 0 5 0 4 0
4 0 0 3 1 2 3 2 1 4 0
5 5 4 0 0 0 9 0 0 0 0
6 0 0 4 2 2 2 2 2 4 2
MHHUUR MHKOOP MAUT1 MAUT2 MAUT0 MZFONDS MZPART MINKM30 MINK3045 MINK4575
1 1 8 8 0 1 8 1 0 4 5
2 2 7 7 1 2 6 3 2 0 5
3 7 2 7 0 2 9 0 4 5 0
4 5 4 9 0 0 7 2 1 5 3
5 4 5 6 2 1 5 4 0 0 9
6 9 0 5 3 3 9 0 5 2 3
MINK7512 MINK123M MINKGEM MKOOPKLA PWAPART PWABEDR PWALAND PPERSAUT PBESAUT
1 0 0 4 3 0 0 0 6 0
2 2 0 5 4 2 0 0 0 0
3 0 0 3 4 2 0 0 6 0
4 0 0 4 4 0 0 0 6 0
5 0 0 6 3 0 0 0 0 0
6 0 0 3 3 0 0 0 6 0
PMOTSCO PVRAAUT PAANHANG PTRACTOR PWERKT PBROM PLEVEN PPERSONG PGEZONG
1 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0
PWAOREG PBRAND PZEILPL PPLEZIER PFIETS PINBOED PBYSTAND AWAPART AWABEDR
1 0 5 0 0 0 0 0 0 0
2 0 2 0 0 0 0 0 2 0
3 0 2 0 0 0 0 0 1 0
4 0 2 0 0 0 0 0 0 0
5 0 6 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0
AWALAND APERSAUT ABESAUT AMOTSCO AVRAAUT AAANHANG ATRACTOR AWERKT ABROM
1 0 1 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0
3 0 1 0 0 0 0 0 0 0
4 0 1 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0
6 0 1 0 0 0 0 0 0 0
ALEVEN APERSONG AGEZONG AWAOREG ABRAND AZEILPL APLEZIER AFIETS AINBOED
1 0 0 0 0 1 0 0 0 0
2 0 0 0 0 1 0 0 0 0
3 0 0 0 0 1 0 0 0 0
4 0 0 0 0 1 0 0 0 0
5 0 0 0 0 1 0 0 0 0
6 0 0 0 0 0 0 0 0 0
ABYSTAND Purchase
1 0 No
2 0 No
3 0 No
4 0 No
5 0 No
6 0 No
summary(Caravan$Purchase)
No Yes
5474 348
max(table(Caravan$Purchase))/sum(table(Caravan$Purchase)) # this is the baseline performance
[1] 0.9402267
We now split the observations into a test set, containing the first \(1000\) observations, and a training set, containing the remaining observations. We fit a KNN model on the training data using \(K = 1\), and evaluate its performance on the test data.
which(names(Caravan) == 'Purchase')
[1] 86
X <- scale(Caravan[,-which(names(Caravan) == 'Purchase')]) # standardize the predictors: KNN is distance-based
idx_test <- 1:1000
X_tr <- X[-idx_test,]
X_test <- X[idx_test,]
y_tr <- Caravan$Purchase[-idx_test]
y_test <- Caravan$Purchase[idx_test]
knn_pred <- knn(X_tr, X_test, y_tr, k = 1)
mean(y_test != knn_pred)
[1] 0.115
mean(y_test != "No")
[1] 0.059
table(knn_pred, y_test)
y_test
knn_pred No Yes
No 875 49
Yes 66 10
knn_pred <- knn(X_tr, X_test, y_tr, k = 3)
table(knn_pred, y_test)
y_test
knn_pred No Yes
No 921 54
Yes 20 5
knn_pred <- knn(X_tr, X_test, y_tr, k = 5)
table(knn_pred, y_test)
y_test
knn_pred No Yes
No 930 55
Yes 11 4
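Rather than eyeballing these tables, we can compute, for each value of \(K\), the fraction of customers flagged as “Yes” that actually purchase insurance, which is arguably the number an insurer cares about. A minimal sketch (ties in knn are broken at random, so exact numbers may vary):
set.seed(1)
for (k in c(1, 3, 5)){
pred <- knn(X_tr, X_test, y_tr, k = k)
tab <- table(pred, y_test)
cat("K =", k, "- precision among predicted 'Yes':",
round(tab["Yes", "Yes"] / sum(tab["Yes", ]), 3), "\n")
}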